/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

/* Integer registers describing the problem. M, N, K, A, B and C are used
   as they arrive on entry; LDC and OFFSET are loaded from the stack in the
   prologue. NOTE(review): presumably M/N/K are the row, column and inner
   dimension counts and A/B/C the two packed operand buffers and the output
   matrix - confirm against the C prototype of this kernel. */
#define M	$4
#define	N	$5
#define	K	$6
#define A	$9
#define B	$10
#define C	$11
#define LDC	$8

/* Running pointers into the A and B buffers for the tile being processed. */
#define AO	$12
#define BO	$13

/* Loop counters: I counts rows down from M, J selects/counts column groups,
   L counts inner-loop iterations and is also reused as a scratch offset. */
#define I	$2
#define J	$3
#define L	$7

/* Output pointers, one per column of the current column group. */
#define CO1	$14
#define CO2	$15
#define CO3	$16
#define CO4	$17

/* OFFSET is the stack-passed offset argument, KK is the running position
   derived from it, TEMP is scratch, AORIG is the base of A used by the
   LN and RT variants. $16..$21 are saved and restored by this routine. */
#define OFFSET	$18
#define KK	$19
#define TEMP	$20
#define AORIG	$21

/* Floating-point registers loaded from AO in the inner loops. They are
   reused as scratch products in the solve phase. */
#define a1	$f0
#define a2	$f1
#define a3	$f26
#define a4	$f27

/* Floating-point registers loaded from BO in the inner loops, and used to
   hold right-hand-side values and matrix entries in the solve phase. */
#define b1	$f2
#define b2	$f3
#define b3	$f4
#define b4	$f5
#define b5	$f6
#define b6	$f7
#define b7	$f8
#define b8	$f9

/* Alias of b8; not referenced elsewhere in the visible code. */
#define a5	b8

/* Accumulators. For each column pair cX1/cX2 with odd X the solve phase
   folds in the following even-numbered pair (c11 += c22, c12 += c21, ...). */
#define c11	$f10
#define c12	$f11
#define c21	$f12
#define c22	$f13
#define c31	$f14
#define c32	$f15
#define c41	$f16
#define c42	$f17
#define c51	$f18
#define c52	$f19
#define c61	$f20
#define c62	$f21
#define c71	$f22
#define c72	$f23
#define c81	$f24
#define c82	$f25

/* MADD1..MADD4 pick the operation used for the four partial products
   accumulated in the inner loops; MADD5..MADD8 are used in the solve phase.
   The choice depends on CONJ and, when CONJ is defined, on whether a
   left-side (LN/LT) or a right-side variant is being built.
   NOTE(review): MADD, MSUB and NMSUB are defined outside this file
   (presumably in common.h as the fused multiply-add/subtract mnemonics) -
   confirm their operand order and sign conventions there. */
#ifndef CONJ
#define MADD1	  MADD
#define MADD2	  MADD
#define MADD3	  MADD
#define MADD4	  NMSUB
#define MADD5	  MSUB
#define MADD6	  MADD
#define MADD7	  NMSUB
#define MADD8	  MADD
#else
/* Conjugated build: the sign flip moves to MADD2 for the left-side
   variants and to MADD3 for the right-side variants. */
#if defined(LN) || defined(LT)
#define MADD1	  MADD
#define MADD2	  NMSUB
#define MADD3	  MADD
#define MADD4	  MADD
#else
#define MADD1	  MADD
#define MADD2	  MADD
#define MADD3	  NMSUB
#define MADD4	  MADD
#endif
#define MADD5	  MADD
#define MADD6	  MSUB
#define MADD7	  MADD
#define MADD8	  NMSUB
#endif

	/* Kernel entry. The routine processes the N columns in three passes:
	   one leftover column (N & 1), then two columns (N & 2), then groups
	   of four columns (N >> 2). Exactly one of LN, LT, RN, RT is expected
	   to be defined by the build and selects the traversal direction.
	   NOTE(review): the subtract-then-multiply sequence in the solve
	   phases looks like a complex triangular-solve micro-kernel - confirm
	   against the build rules that instantiate this file. */
	PROLOGUE

	/* Reserve a 128-byte frame for the registers saved below. */
	daddiu	$sp, $sp, -128

	/* $16..$21 back CO3, CO4, OFFSET, KK, TEMP and AORIG. */
	SDARG	$16,   0($sp)
	SDARG	$17,   8($sp)
	SDARG	$18,  16($sp)
	SDARG	$19,  24($sp)
	SDARG	$20,  32($sp)
	SDARG	$21,  40($sp)

	/* $f24..$f27 back c81, c82, a3 and a4. */
	sdc1	$f24, 48($sp)
	sdc1	$f25, 56($sp)
	sdc1	$f26, 64($sp)
	sdc1	$f27, 72($sp)

#ifndef __64BIT__
	/* Without __64BIT__, $f20..$f23 (c61..c72) are saved as well. */
	sdc1	$f20, 88($sp)
	sdc1	$f21, 96($sp)
	sdc1	$f22,104($sp)
	sdc1	$f23,112($sp)
#endif

	/* The last two arguments live just above the new frame. */
	LDARG	LDC,    128 + 0($sp)
	LDARG	OFFSET, 128 + 8($sp)

	/* Scale LDC by ZBASE_SHIFT so it can be added directly to C. */
	dsll	LDC, LDC, ZBASE_SHIFT

#ifdef LN
	/* LN starts from the far end: A += (M * K) << ZBASE_SHIFT and
	   C += M << ZBASE_SHIFT.
	   NOTE(review): mult/mflo is a 32-bit multiply; confirm callers keep
	   M * K within that range. */
	mult	M, K
	mflo	TEMP

	dsll	TEMP, TEMP, ZBASE_SHIFT
	daddu	A, A, TEMP

	dsll	TEMP, M, ZBASE_SHIFT
	daddu	C, C, TEMP
#endif

#ifdef RN
	/* RN starts with KK = -OFFSET. */
	neg	KK, OFFSET
#endif

#ifdef RT
	/* RT starts from the far end: B += (N * K) << ZBASE_SHIFT,
	   C += N * LDC (LDC already scaled) and KK = N - OFFSET.
	   NOTE(review): both products use the 32-bit mult; confirm N * LDC
	   cannot exceed that range for large leading dimensions. */
	mult	N, K
	mflo	TEMP

	dsll	TEMP, TEMP, ZBASE_SHIFT
	daddu	B, B, TEMP

	mult	N, LDC
	mflo	TEMP
	daddu	C, C, TEMP

	dsubu	KK, N, OFFSET
#endif

	/* Pass 1: handle the single leftover column when N is odd; otherwise
	   skip to the two-column pass at .L20. */
	andi	J,  N, 1
	blez	J, .L20
	NOP

#ifdef RT
	/* RT walks backwards: step B back by K << ZBASE_SHIFT and C back by
	   one LDC stride before processing this column. */
	dsll	TEMP, K, ZBASE_SHIFT
	dsubu	B, B, TEMP

	dsubu	C, C, LDC
#endif

	/* NOTE(review): MTC is defined outside this file; presumably it moves
	   integer zero into c11 so the MOV copies in .L31 clear the other
	   accumulators - confirm in common.h. */
	MTC	$0,  c11

	move	CO1, C

#ifdef LN
	daddu	KK, M, OFFSET
#endif

#ifdef LT
	move	KK, OFFSET
#endif

	/* LN and RT recompute AO from AORIG for every row; the other variants
	   simply stream through A via AO. */
#if defined(LN) || defined(RT)
	move	AORIG, A
#else
	move	AO, A
#endif
#ifndef RT
	/* Non-RT variants advance C to the next column right away. */
	daddu	C,  CO1, LDC
#endif

	/* Row loop counter; skip the row loop when M <= 0. */
	move	I,  M
	blez	I, .L39
	NOP
	.align 3

/* .L31: per-row setup for the one-column pass. Clears the accumulators by
   copying c11, preloads the first A and B values and computes the
   unrolled-by-4 trip count L. */
.L31:
#if defined(LT) || defined(RN)
	/* LT/RN: the inner product runs over the first KK steps, reading A
	   from the current AO and B from the start of the panel. */
	LD	a1,  0 * SIZE(AO)
	MOV	c21, c11
	LD	b1,  0 * SIZE(B)
	MOV	c31, c11
	LD	a2,  1 * SIZE(AO)

	MOV	c41, c11
	LD	b2,  1 * SIZE(B)
	MOV	c12, c11
	dsra	L,  KK, 2

	MOV	c22, c11
	LD	a3,  4 * SIZE(AO)
	MOV	c32, c11
	LD	b3,  4 * SIZE(B)

	NOP
	MOV	c42, c11
	/* BO = B is set in the branch delay slot, so it holds on both paths. */
	blez	L, .L35
	move	BO,  B
#else
#ifdef LN
	/* LN moves AORIG back by one row of K entries before each row. */
	dsll	TEMP,   K,  ZBASE_SHIFT
	dsubu	AORIG, AORIG, TEMP
#endif
	/* LN/RT: the inner product runs over the last K - KK steps, so both
	   pointers start KK entries into their buffers. */
	dsll	TEMP, KK, ZBASE_SHIFT

	daddu	AO, AORIG, TEMP
	daddu	BO, B,     TEMP

	/* TEMP = K - KK stays live until .L35, which uses its low two bits. */
	dsubu	TEMP, K, KK

	LD	a1,  0 * SIZE(AO)
	MOV	c21, c11
	LD	b1,  0 * SIZE(BO)
	MOV	c31, c11
	LD	a2,  1 * SIZE(AO)

	MOV	c41, c11
	LD	b2,  1 * SIZE(BO)
	MOV	c12, c11
	dsra	L, TEMP, 2

	MOV	c22, c11
	LD	a3,  4 * SIZE(AO)
	MOV	c32, c11
	LD	b3,  4 * SIZE(BO)

	/* The last accumulator clear sits in the branch delay slot. */
	blez	L, .L35
	MOV	c42, c11
#endif
	.align	3

/* .L32: one-column inner loop, unrolled over four k-steps. Each step forms
   the four partial products a1*b1, a1*b2, a2*b1 and a2*b2 into c11, c21,
   c12 and c22 while the operands for later steps are loaded. AO and BO
   both advance by 8 * SIZE per iteration. */
.L32:
	/* k-step 0 (operands preloaded before entry / by the previous pass). */
	MADD1	c11, c11, a1, b1
	LD	b4,  3 * SIZE(BO)
	MADD3	c21, c21, a1, b2
	LD	a1,  2 * SIZE(AO)
	MADD2	c12, c12, a2, b1
	LD	b1,  2 * SIZE(BO)
	MADD4	c22, c22, a2, b2
	LD	a2,  3 * SIZE(AO)

	/* k-step 1; a1 and b1 are reloaded here for the next iteration. */
	MADD1	c11, c11, a1, b1
	LD	b2,  5 * SIZE(BO)
	MADD3	c21, c21, a1, b4
	LD	a1,  8 * SIZE(AO)
	MADD2	c12, c12, a2, b1
	LD	b1,  8 * SIZE(BO)
	MADD4	c22, c22, a2, b4
	LD	a2,  5 * SIZE(AO)

	/* k-step 2 uses a3/b3, which were loaded from offset 4. */
	MADD1	c11, c11, a3, b3
	LD	b4,  7 * SIZE(BO)
	MADD3	c21, c21, a3, b2
	LD	a3,  6 * SIZE(AO)
	MADD2	c12, c12, a2, b3
	LD	b3,  6 * SIZE(BO)
	MADD4	c22, c22, a2, b2
	LD	a2,  7 * SIZE(AO)

	/* k-step 3; a3, b3, a2 and b2 are reloaded for the next iteration. */
	MADD1	c11, c11, a3, b3
	LD	b2,  9 * SIZE(BO)
	MADD3	c21, c21, a3, b4
	LD	a3, 12 * SIZE(AO)
	MADD2	c12, c12, a2, b3
	LD	b3, 12 * SIZE(BO)
	MADD4	c22, c22, a2, b4
	LD	a2,  9 * SIZE(AO)

	daddiu	AO, AO,  8 * SIZE
	daddiu	L, L, -1

	/* BO is advanced in the branch delay slot. */
	bgtz	L, .L32
	daddiu	BO, BO,  8 * SIZE
	.align 3

/* .L35: remainder of the one-column inner product. L is the step count
   modulo 4, taken from KK (LT/RN) or from TEMP = K - KK (LN/RT). */
.L35:
#if defined(LT) || defined(RN)
	andi	L, KK,  3
#else
	andi	L, TEMP, 3
#endif
	blez	L, .L38
	NOP
	.align	3

/* .L36: one k-step per iteration; AO and BO advance by 2 * SIZE each. */
.L36:
	MADD1	c11, c11, a1, b1
	daddiu	L, L, -1
	MADD3	c21, c21, a1, b2
	LD	a1,  2 * SIZE(AO)
	MADD2	c12, c12, a2, b1
	LD	b1,  2 * SIZE(BO)
	MADD4	c22, c22, a2, b2
	LD	a2,  3 * SIZE(AO)

	LD	b2,  3 * SIZE(BO)
	daddiu	BO, BO,  2 * SIZE
	/* AO is advanced in the branch delay slot. */
	bgtz	L, .L36
	daddiu	AO, AO,  2 * SIZE

/* .L38: finish one row of the one-column pass: fold the partial sums,
   subtract them from the stored right-hand side, multiply by one matrix
   entry, write the result to the packed buffer and to C, then step to the
   next row. */
.L38:
	/* Fold the cross-term accumulators into the (c11, c12) pair. */
 	ADD	c11, c11, c22
	ADD	c12, c12, c21

#if defined(LN) || defined(RT)
	/* LN/RT: reposition AO and BO at entry KK - 1 of their buffers. */
	daddiu	TEMP, KK, -1

	dsll	TEMP, TEMP, ZBASE_SHIFT
	daddu	AO, AORIG, TEMP
	daddu	BO, B,     TEMP
#endif

	/* (c11, c12) = stored value - accumulated sum. The stored value comes
	   from BO for the left-side variants and from AO otherwise. */
#if defined(LN) || defined(LT)
	LD	b1,  0 * SIZE(BO)
	LD	b2,  1 * SIZE(BO)

	SUB	c11, b1, c11
	SUB	c12, b2, c12
#else
	LD	b1,  0 * SIZE(AO)
	LD	b2,  1 * SIZE(AO)

	SUB	c11, b1, c11
	SUB	c12, b2, c12
#endif

	/* Combine (c11, c12) with the pair (b1, b2) read from the matrix:
	   a1 = b2*c12, a2 = b2*c11, then MADD5/MADD6 merge in b1*c11 and
	   b1*c12. NOTE(review): this is a multiplication rather than a
	   division, so the entry is presumably stored pre-inverted by the
	   packing code - confirm. */
#if defined(LN) || defined(LT)
	LD	b1,  0 * SIZE(AO)
	LD	b2,  1 * SIZE(AO)

	MUL	a1, b2, c12
	MUL	a2, b2, c11

	MADD5	c11, a1, b1, c11
	MADD6	c12, a2, b1, c12
#endif

#if defined(RN) || defined(RT)
	LD	b1,  0 * SIZE(BO)
	LD	b2,  1 * SIZE(BO)

	MUL	a1, b2, c12
	MUL	a2, b2, c11

	MADD5	c11, a1, b1, c11
	MADD6	c12, a2, b1, c12
#endif

	/* Write the result back over the value it was computed from. */
#if defined(LN) || defined(LT)
	ST	c11,  0 * SIZE(BO)
	ST	c12,  1 * SIZE(BO)
#else
	ST	c11,  0 * SIZE(AO)
	ST	c12,  1 * SIZE(AO)
#endif

#ifdef LN
	/* LN walks C backwards: pre-decrement the output pointer. */
	daddiu	CO1,CO1, -2 * SIZE
#endif

	ST	c11,  0 * SIZE(CO1)
	ST	c12,  1 * SIZE(CO1)

#ifndef LN
	daddiu	CO1,CO1, 2 * SIZE
#endif

	/* Re-zero c11 for the next row (see the MOV copies in .L31). */
	MTC	$0,  c11

#ifdef RT
	dsll	TEMP, K, ZBASE_SHIFT
	daddu	AORIG, AORIG, TEMP
#endif

#if defined(LT) || defined(RN)
	/* Skip the K - KK entries that were not consumed by this row. */
	dsubu	TEMP, K, KK
	dsll	TEMP, TEMP, ZBASE_SHIFT
	daddu	AO, AO, TEMP
	daddu	BO, BO, TEMP
#endif

#ifdef LT
	daddiu	KK, KK, 1
#endif

#ifdef LN
	daddiu	KK, KK, -1
#endif

	daddiu	I, I, -1

	bgtz	I, .L31
	NOP
	.align 3

/* .L39: end of the one-column pass. Move B past (or onto) the panel just
   processed and update KK for the right-side variants. */
.L39:
#ifdef LN
	/* LN never moved B while iterating, so advance it by one column. */
	dsll	TEMP, K, ZBASE_SHIFT
	daddu	B, B, TEMP
#endif

#if defined(LT) || defined(RN)
	/* BO already points past the column that was just consumed. */
	move	B,  BO
#endif

#ifdef RN
	daddiu	KK, KK,  1
#endif

#ifdef RT
	daddiu	KK, KK, -1
#endif
	.align 3

/* .L20: pass 2, taken when bit 1 of N is set: process two columns at once.
   Otherwise continue with the four-column pass at .L30. */
.L20:
	andi	J,  N, 2
	blez	J, .L30
	NOP

#ifdef RT
	/* RT walks backwards: step B back by two columns of K entries and C
	   back by two LDC strides. */
	dsll	TEMP, K, 1 + ZBASE_SHIFT
	dsubu	B, B, TEMP

	dsll	TEMP, LDC, 1
	dsubu	C, C, TEMP
#endif

	/* Zero seed for the accumulator clears performed in .L21. */
	MTC	$0,  c11

	/* Output pointers for the two columns. */
	move	CO1, C
	daddu	CO2, C,   LDC

#ifdef LN
	daddu	KK, M, OFFSET
#endif

#ifdef LT
	move	KK, OFFSET
#endif

#if defined(LN) || defined(RT)
	move	AORIG, A
#else
	move	AO, A
#endif
#ifndef RT
	/* Non-RT variants advance C past both columns right away. */
	daddu	C,  CO2, LDC
#endif

	/* Row loop counter; skip the row loop when M <= 0. */
	move	I,  M
	blez	I, .L29
	NOP
	.align 3

/* .L21: per-row setup for the two-column pass. Clears the accumulators by
   copying c11, preloads a1/a3 and b1..b5 and computes the unrolled-by-4
   trip count L. */
.L21:
#if defined(LT) || defined(RN)
	/* LT/RN: the inner product runs over the first KK steps. */
	LD	a1,  0 * SIZE(AO)
	MOV	c21, c11
	LD	b1,  0 * SIZE(B)
	MOV	c31, c11
	LD	a3,  4 * SIZE(AO)
	MOV	c41, c11
	LD	b2,  1 * SIZE(B)
	dsra	L,  KK, 2

	LD	b3,  2 * SIZE(B)
	MOV	c12, c11
	LD	b4,  3 * SIZE(B)
	MOV	c22, c11
	LD	b5,  4 * SIZE(B)
	MOV	c32, c11

	NOP
	MOV	c42, c11
	/* BO = B is set in the branch delay slot. */
	blez	L, .L25
	move	BO,  B
#else
#ifdef LN
	/* LN moves AORIG back by one row of K entries before each row. */
	dsll	TEMP,   K,  ZBASE_SHIFT
	dsubu	AORIG, AORIG, TEMP
#endif

	/* LN/RT: start KK entries into A (one value pair per step) and into B
	   (two value pairs per step, hence the extra shift by one). */
	dsll	L,    KK, ZBASE_SHIFT
	dsll	TEMP, KK, 1 + ZBASE_SHIFT

	daddu	AO, AORIG, L
	daddu	BO, B,     TEMP

	/* TEMP = K - KK stays live until .L25, which uses its low two bits. */
	dsubu	TEMP, K, KK

	LD	a1,  0 * SIZE(AO)
	MOV	c21, c11
	LD	b1,  0 * SIZE(BO)
	MOV	c31, c11
	LD	a3,  4 * SIZE(AO)
	MOV	c41, c11
	LD	b2,  1 * SIZE(BO)
	dsra	L,  TEMP, 2

	LD	b3,  2 * SIZE(BO)
	MOV	c12, c11
	LD	b4,  3 * SIZE(BO)
	MOV	c22, c11
	LD	b5,  4 * SIZE(BO)
	MOV	c32, c11

	/* The last accumulator clear sits in the branch delay slot. */
	blez	L, .L25
	MOV	c42, c11
#endif
	.align	3

/* .L22: two-column inner loop, unrolled over four k-steps. Each k-step is
   split in two halves: the first multiplies the value at even offset of A
   (a1 or a3) into c11/c21/c31/c41, the second multiplies the value at odd
   offset (a2) into c12/c22/c32/c42. Per iteration AO advances by 8 * SIZE
   and BO by 16 * SIZE. */
.L22:
	/* k-step 0. */
	MADD1	c11, c11, a1, b1
	LD	a2,  1 * SIZE(AO)
	MADD3	c21, c21, a1, b2
	daddiu	L, L, -1
	MADD1	c31, c31, a1, b3
	NOP
	MADD3	c41, c41, a1, b4
	LD	a1,  2 * SIZE(AO)

	MADD2	c12, c12, a2, b1
	LD	b1,  8 * SIZE(BO)
	MADD4	c22, c22, a2, b2
	LD	b2,  5 * SIZE(BO)
	MADD2	c32, c32, a2, b3
	LD	b3,  6 * SIZE(BO)
	MADD4	c42, c42, a2, b4
	LD	b4,  7 * SIZE(BO)

	/* k-step 1 (b5 holds the value from offset 4 of BO). */
	MADD1	c11, c11, a1, b5
	LD	a2,  3 * SIZE(AO)
	MADD3	c21, c21, a1, b2
	NOP
	MADD1	c31, c31, a1, b3
	NOP
	MADD3	c41, c41, a1, b4
	LD	a1,  8 * SIZE(AO)

	MADD2	c12, c12, a2, b5
	LD	b5, 12 * SIZE(BO)
	MADD4	c22, c22, a2, b2
	LD	b2,  9 * SIZE(BO)
	MADD2	c32, c32, a2, b3
	LD	b3, 10 * SIZE(BO)
	MADD4	c42, c42, a2, b4
	LD	b4, 11 * SIZE(BO)

	/* k-step 2 uses a3, loaded from offset 4 of AO. */
	MADD1	c11, c11, a3, b1
	LD	a2,  5 * SIZE(AO)
	MADD3	c21, c21, a3, b2
	NOP
	MADD1	c31, c31, a3, b3
	NOP
	MADD3	c41, c41, a3, b4
	LD	a3,  6 * SIZE(AO)

	MADD2	c12, c12, a2, b1
	LD	b1, 16 * SIZE(BO)
	MADD4	c22, c22, a2, b2
	LD	b2, 13 * SIZE(BO)
	MADD2	c32, c32, a2, b3
	LD	b3, 14 * SIZE(BO)
	MADD4	c42, c42, a2, b4
	LD	b4, 15 * SIZE(BO)

	/* k-step 3. AO is advanced in the middle of this group, so the a3
	   reload that follows is relative to the new AO. */
	MADD1	c11, c11, a3, b5
	LD	a2,  7 * SIZE(AO)
	MADD3	c21, c21, a3, b2
	daddiu	AO, AO,  8 * SIZE
	MADD1	c31, c31, a3, b3
	NOP
	MADD3	c41, c41, a3, b4
	LD	a3,  4 * SIZE(AO)

	MADD2	c12, c12, a2, b5
	LD	b5, 20 * SIZE(BO)
	MADD4	c22, c22, a2, b2
	LD	b2, 17 * SIZE(BO)
	MADD2	c32, c32, a2, b3
	LD	b3, 18 * SIZE(BO)
	MADD4	c42, c42, a2, b4
	LD	b4, 19 * SIZE(BO)

	/* BO is advanced in the branch delay slot. */
	bgtz	L, .L22
	daddiu	BO, BO, 16 * SIZE
	.align 3

/* .L25: remainder of the two-column inner product. L is the step count
   modulo 4, taken from KK (LT/RN) or from TEMP = K - KK (LN/RT). */
.L25:
#if defined(LT) || defined(RN)
	andi	L, KK,  3
#else
	andi	L, TEMP, 3
#endif
	blez	L, .L28
	NOP
	.align	3

/* .L26: one k-step per iteration. BO is advanced by 4 * SIZE in the middle
   of the body, so the B reloads after it use offsets 0..3 of the new BO;
   AO advances by 2 * SIZE in the branch delay slot. */
.L26:
	MADD1	c11, c11, a1, b1
	LD	a2,  1 * SIZE(AO)
	MADD3	c21, c21, a1, b2
	daddiu	L, L, -1
	MADD1	c31, c31, a1, b3
	daddiu	BO, BO,  4 * SIZE
	MADD3	c41, c41, a1, b4
	LD	a1,  2 * SIZE(AO)

	MADD2	c12, c12, a2, b1
	LD	b1,  0 * SIZE(BO)
	MADD4	c22, c22, a2, b2
	LD	b2,  1 * SIZE(BO)
	MADD2	c32, c32, a2, b3
	LD	b3,  2 * SIZE(BO)
	MADD4	c42, c42, a2, b4
	LD	b4,  3 * SIZE(BO)

	bgtz	L, .L26
	daddiu	AO, AO,  2 * SIZE

/* .L28: finish one row of the two-column pass: fold the partial sums,
   subtract them from the stored right-hand side, apply the solve step for
   the active variant, write the results to the packed buffer and to C, then
   step to the next row. */
.L28:
	/* Fold the cross-term accumulators: (c11, c12) belongs to the first
	   column, (c31, c32) to the second. */
 	ADD	c11, c11, c22
	ADD	c12, c12, c21
	ADD	c31, c31, c42
	ADD	c32, c32, c41

#if defined(LN) || defined(RT)
	/* Reposition AO/BO: one entry back for LN, two entries back for RT. */
#ifdef LN
	daddiu	TEMP, KK, -1
#else
	daddiu	TEMP, KK, -2
#endif

	dsll	L,    TEMP, ZBASE_SHIFT
	dsll	TEMP, TEMP, 1 + ZBASE_SHIFT
	daddu	AO, AORIG, L
	daddu	BO, B,     TEMP
#endif

	/* c = stored value - accumulated sum. The stored values come from BO
	   for the left-side variants and from AO otherwise. */
#if defined(LN) || defined(LT)
	LD	b1,  0 * SIZE(BO)
	LD	b2,  1 * SIZE(BO)
	LD	b3,  2 * SIZE(BO)
	LD	b4,  3 * SIZE(BO)

	SUB	c11, b1, c11
	SUB	c12, b2, c12
	SUB	c31, b3, c31
	SUB	c32, b4, c32
#else
	LD	b1,  0 * SIZE(AO)
	LD	b2,  1 * SIZE(AO)
	LD	b3,  2 * SIZE(AO)
	LD	b4,  3 * SIZE(AO)

	SUB	c11, b1, c11
	SUB	c12, b2, c12
 	SUB	c31, b3, c31
	SUB	c32, b4, c32
#endif

#if defined(LN) || defined(LT)
	/* Left side: both columns are combined with the same pair (b1, b2)
	   read from AO. NOTE(review): a multiplication is used, so the entry
	   is presumably stored pre-inverted - confirm. */
	LD	b1,  0 * SIZE(AO)
	LD	b2,  1 * SIZE(AO)

	MUL	a1, b2, c12
	MUL	a2, b2, c11
	MUL	a3, b2, c32
	MUL	a4, b2, c31

	MADD5	c11, a1, b1, c11
	MADD6	c12, a2, b1, c12
	MADD5	c31, a3, b1, c31
	MADD6	c32, a4, b1, c32
#endif

#ifdef RN
	/* RN: forward pass over the 2x2 block at BO. Offsets 0,1 scale the
	   first column, offsets 2,3 eliminate it from the second column,
	   offsets 6,7 scale the second column. */
	LD	b1,  0 * SIZE(BO)
	LD	b2,  1 * SIZE(BO)
	LD	b3,  2 * SIZE(BO)
	LD	b4,  3 * SIZE(BO)

	MUL	a1, b2, c12
	MUL	a2, b2, c11

	MADD5	c11, a1, b1, c11
	MADD6	c12, a2, b1, c12

	NMSUB	c31, c31, b3, c11
	MADD7	c32, c32, b4, c11

	MADD8	c31, c31, b4, c12
	NMSUB	c32, c32, b3, c12

	LD	b3,  6 * SIZE(BO)
	LD	b4,  7 * SIZE(BO)

	MUL	a1, b4, c32
	MUL	a2, b4, c31

	MADD5	c31, a1, b3, c31
	MADD6	c32, a2, b3, c32
#endif

#ifdef RT
	/* RT: backward pass over the 2x2 block at BO. Offsets 6,7 scale the
	   second column, offsets 4,5 eliminate it from the first column,
	   offsets 0,1 scale the first column. */
	LD	b5,  6 * SIZE(BO)
	LD	b6,  7 * SIZE(BO)
	LD	b7,  4 * SIZE(BO)
	LD	b8,  5 * SIZE(BO)

	MUL	a1, b6, c32
	MUL	a2, b6, c31

	MADD5	c31, a1, b5, c31
	MADD6	c32, a2, b5, c32

	NMSUB	c11, c11, b7, c31
	MADD7	c12, c12, b8, c31

	MADD8	c11, c11, b8, c32
	NMSUB	c12, c12, b7, c32

	LD	b7,  0 * SIZE(BO)
	LD	b8,  1 * SIZE(BO)

	MUL	a1, b8, c12
	MUL	a2, b8, c11

	MADD5	c11, a1, b7, c11
	MADD6	c12, a2, b7, c12
#endif

	/* Write the results back over the values they were computed from. */
#if defined(LN) || defined(LT)
	ST	c11,  0 * SIZE(BO)
	ST	c12,  1 * SIZE(BO)
	ST	c31,  2 * SIZE(BO)
	ST	c32,  3 * SIZE(BO)
#else
	ST	c11,  0 * SIZE(AO)
	ST	c12,  1 * SIZE(AO)
	ST	c31,  2 * SIZE(AO)
	ST	c32,  3 * SIZE(AO)
#endif

#ifdef LN
	/* LN walks C backwards: pre-decrement the output pointers. */
	daddiu	CO1,CO1, -2 * SIZE
	daddiu	CO2,CO2, -2 * SIZE
#endif

	ST	c11,  0 * SIZE(CO1)
	ST	c12,  1 * SIZE(CO1)
	ST	c31,  0 * SIZE(CO2)
	ST	c32,  1 * SIZE(CO2)

#ifndef LN
	daddiu	CO1,CO1, 2 * SIZE
	daddiu	CO2,CO2, 2 * SIZE
#endif

	/* Re-zero c11 for the next row (see the MOV copies in .L21). */
	MTC	$0,  c11

#ifdef RT
	dsll	TEMP, K, ZBASE_SHIFT
	daddu	AORIG, AORIG, TEMP
#endif

#if defined(LT) || defined(RN)
	/* Skip the K - KK entries not consumed by this row (B holds two
	   value pairs per step, hence the extra shift by one). */
	dsubu	TEMP, K, KK
	dsll	L,    TEMP, ZBASE_SHIFT
	dsll	TEMP, TEMP, 1 + ZBASE_SHIFT
	daddu	AO, AO, L
	daddu	BO, BO, TEMP
#endif

#ifdef LT
	daddiu	KK, KK, 1
#endif

#ifdef LN
	daddiu	KK, KK, -1
#endif

	daddiu	I, I, -1

	bgtz	I, .L21
	NOP
	.align 3

/* .L29: end of the two-column pass. Move B past (or onto) the panel just
   processed and update KK by two for the right-side variants. */
.L29:
#ifdef LN
	/* LN never moved B while iterating, so advance it by two columns. */
	dsll	TEMP, K, 1 + ZBASE_SHIFT
	daddu	B, B, TEMP
#endif

#if defined(LT) || defined(RN)
	/* BO already points past the columns that were just consumed. */
	move	B,  BO
#endif

#ifdef RN
	daddiu	KK, KK,  2
#endif

#ifdef RT
	daddiu	KK, KK, -2
#endif
	.align 3

/* .L30: pass 3, the remaining N >> 2 groups of four columns. J counts the
   groups; when there are none, go straight to the epilogue. */
.L30:
	dsra	J,  N, 2
	blez	J, .L999
	nop

/* .L10: setup for one group of four columns. */
.L10:
#ifdef RT
	/* RT walks backwards: step B back by four columns of K entries and C
	   back by four LDC strides. */
	dsll	TEMP, K, 2 + ZBASE_SHIFT
	dsubu	B, B, TEMP

	dsll	TEMP, LDC, 2
	dsubu	C, C, TEMP
#endif

	/* Output pointers for the four columns, interleaved with the group
	   counter decrement and the first accumulator clears. */
	move	CO1, C
	MTC	$0,  c11
	daddu	CO2, C,   LDC
	daddu	CO3, CO2, LDC
	daddiu	J, J, -1
	daddu	CO4, CO3, LDC
	MOV	c21, c11
	MOV	c31, c11
	MOV	c41, c11
	MOV	c51, c11
	move	I,  M

#ifdef LN
	daddu	KK, M, OFFSET
#endif

#ifdef LT
	move	KK, OFFSET
#endif

#if defined(LN) || defined(RT)
	move	AORIG, A
#else
	move	AO, A
#endif
#ifndef RT
	/* Non-RT variants advance C past all four columns right away. */
	daddu	C,  CO4, LDC
#endif

	/* Skip the row loop when M <= 0; c61 is cleared in the delay slot. */
	blez	I, .L19
	MOV	c61, c11
	.align 3

/* .L11: per-row setup for the four-column pass. Clears the remaining
   accumulators (c11..c61 are already zero on entry), preloads a1/a3 and
   b1..b7, computes the unrolled-by-4 trip count L and, when L > 0, issues
   the first half k-step so that .L12 can start in the middle of a step. */
.L11:
#if defined(LT) || defined(RN)
	/* LT/RN: the inner product runs over the first KK steps. */
	LD	a1,  0 * SIZE(AO)
	MOV	c71, c11
	LD	b1,  0 * SIZE(B)
	MOV	c81, c11

	LD	a3,  4 * SIZE(AO)
	MOV	c12, c11
	LD	b2,  1 * SIZE(B)
	MOV	c22, c11

	dsra	L,  KK, 2
	MOV	c32, c11
	LD	b3,  2 * SIZE(B)
	MOV	c42, c11

	LD	b4,  3 * SIZE(B)
	MOV	c52, c11
	LD	b5,  4 * SIZE(B)
	MOV	c62, c11

	LD	b6,  8 * SIZE(B)
	MOV	c72, c11
	LD	b7, 12 * SIZE(B)
	MOV	c82, c11

	/* BO = B is set in the branch delay slot. */
	blez	L, .L15
	move	BO,  B
#else
#ifdef LN
	/* LN moves AORIG back by one row of K entries before each row. */
	dsll	TEMP,   K,  ZBASE_SHIFT
	dsubu	AORIG, AORIG, TEMP
#endif

	/* LN/RT: start KK entries into A (one value pair per step) and into B
	   (four value pairs per step, hence the extra shift by two). */
	dsll	L,    KK, ZBASE_SHIFT
	dsll	TEMP, KK, 2 + ZBASE_SHIFT

	daddu	AO, AORIG, L
	daddu	BO, B,     TEMP

	/* TEMP = K - KK stays live until .L15, which uses its low two bits. */
	dsubu	TEMP, K, KK

	LD	a1,  0 * SIZE(AO)
	MOV	c71, c11
	LD	b1,  0 * SIZE(BO)
	MOV	c81, c11

	LD	a3,  4 * SIZE(AO)
	MOV	c12, c11
	LD	b2,  1 * SIZE(BO)
	MOV	c22, c11

	dsra	L,  TEMP, 2
	MOV	c32, c11
	LD	b3,  2 * SIZE(BO)
	MOV	c42, c11

	LD	b4,  3 * SIZE(BO)
	MOV	c52, c11
	LD	b5,  4 * SIZE(BO)
	MOV	c62, c11

	LD	b6,  8 * SIZE(BO)
	MOV	c72, c11
	LD	b7, 12 * SIZE(BO)
	MOV	c82, c11

	blez	L, .L15
	NOP
#endif

	/* Pipeline priming: the a1 half of k-step 0 for columns 1 and 2. With
	   a single unrolled iteration (L becomes 0) go straight to the drain
	   code at .L13; the fourth product runs in the branch delay slot. */
	MADD1	c11, c11, a1, b1
	LD	a2,  1 * SIZE(AO)
	MADD3	c21, c21, a1, b2
	daddiu	L, L, -1
	MADD1	c31, c31, a1, b3
	NOP
	blez	L, .L13
	MADD3	c41, c41, a1, b4
	.align	3

/* .L12: four-column inner loop, unrolled over four k-steps and software
   pipelined: the body starts with the a2 half of a k-step whose a1 half
   was issued before entry, and ends by issuing the a1 half of the next
   iteration's first k-step. Values from even offsets of A (a1, a4, a3)
   feed the cX1 accumulators, values from odd offsets (a2) feed cX2.
   Per iteration AO advances by 8 * SIZE and BO by 32 * SIZE. */
.L12:
	/* k-step 0, a2 half, columns 1-2; B is reloaded for columns 3-4. */
	MADD2	c12, c12, a2, b1
	LD	b1, 16 * SIZE(BO)
	MADD4	c22, c22, a2, b2
	LD	b2,  5 * SIZE(BO)
	MADD2	c32, c32, a2, b3
	LD	b3,  6 * SIZE(BO)
	MADD4	c42, c42, a2, b4
	LD	b4,  7 * SIZE(BO)

	/* k-step 0, columns 3-4. */
	MADD1	c51, c51, a1, b5
	NOP
	MADD3	c61, c61, a1, b2
	LD	a4,  2 * SIZE(AO)
	MADD1	c71, c71, a1, b3
	NOP
	MADD3	c81, c81, a1, b4
	LD	a1,  8 * SIZE(AO)

	MADD2	c52, c52, a2, b5
	LD	b5, 20 * SIZE(BO)
	MADD4	c62, c62, a2, b2
	LD	b2,  9 * SIZE(BO)
	MADD2	c72, c72, a2, b3
	LD	b3, 10 * SIZE(BO)
	MADD4	c82, c82, a2, b4
	LD	b4, 11 * SIZE(BO)

	/* k-step 1, columns 1-2 (a4 from offset 2 of AO, b6 from offset 8). */
	MADD1	c11, c11, a4, b6
	LD	a2,  3 * SIZE(AO)
	MADD3	c21, c21, a4, b2
	NOP
	MADD1	c31, c31, a4, b3
	NOP
	MADD3	c41, c41, a4, b4
	NOP

	MADD2	c12, c12, a2, b6
	LD	b6, 24 * SIZE(BO)
	MADD4	c22, c22, a2, b2
	LD	b2, 13 * SIZE(BO)
	MADD2	c32, c32, a2, b3
	LD	b3, 14 * SIZE(BO)
	MADD4	c42, c42, a2, b4
	LD	b4, 15 * SIZE(BO)

	/* k-step 1, columns 3-4 (b7 from offset 12). */
	MADD1	c51, c51, a4, b7
	NOP
	MADD3	c61, c61, a4, b2
	NOP
	MADD1	c71, c71, a4, b3
	NOP
	MADD3	c81, c81, a4, b4
	NOP

	MADD2	c52, c52, a2, b7
	LD	b7, 28 * SIZE(BO)
	MADD4	c62, c62, a2, b2
	LD	b2, 17 * SIZE(BO)
	MADD2	c72, c72, a2, b3
	LD	b3, 18 * SIZE(BO)
	MADD4	c82, c82, a2, b4
	LD	b4, 19 * SIZE(BO)

	/* k-step 2, columns 1-2 (a3 from offset 4 of AO, b1 from offset 16). */
	MADD1	c11, c11, a3, b1
	LD	a2,  5 * SIZE(AO)
	MADD3	c21, c21, a3, b2
	NOP
	MADD1	c31, c31, a3, b3
	NOP
	MADD3	c41, c41, a3, b4
	NOP

	MADD2	c12, c12, a2, b1
	LD	b1, 32 * SIZE(BO)
	MADD4	c22, c22, a2, b2
	LD	b2, 21 * SIZE(BO)
	MADD2	c32, c32, a2, b3
	LD	b3, 22 * SIZE(BO)
	MADD4	c42, c42, a2, b4
	LD	b4, 23 * SIZE(BO)

	/* k-step 2, columns 3-4 (b5 from offset 20). */
	MADD1	c51, c51, a3, b5
	NOP
	MADD3	c61, c61, a3, b2
	LD	a4,  6 * SIZE(AO)
	MADD1	c71, c71, a3, b3
	NOP
	MADD3	c81, c81, a3, b4
	LD	a3, 12 * SIZE(AO)

	MADD2	c52, c52, a2, b5
	LD	b5, 36 * SIZE(BO)
	MADD4	c62, c62, a2, b2
	LD	b2, 25 * SIZE(BO)
	MADD2	c72, c72, a2, b3
	LD	b3, 26 * SIZE(BO)
	MADD4	c82, c82, a2, b4
	LD	b4, 27 * SIZE(BO)

	/* k-step 3, columns 1-2; the loop counter is decremented here. */
	MADD1	c11, c11, a4, b6
	LD	a2,  7 * SIZE(AO)
	MADD3	c21, c21, a4, b2
	NOP
	MADD1	c31, c31, a4, b3
	NOP
	MADD3	c41, c41, a4, b4
	daddiu	L, L, -1

	MADD2	c12, c12, a2, b6
	LD	b6, 40 * SIZE(BO)
	MADD4	c22, c22, a2, b2
	LD	b2, 29 * SIZE(BO)
	MADD2	c32, c32, a2, b3
	LD	b3, 30 * SIZE(BO)
	MADD4	c42, c42, a2, b4
	LD	b4, 31 * SIZE(BO)

	/* k-step 3, columns 3-4. BO and AO are advanced here, so every load
	   below is relative to the new bases. */
	MADD1	c51, c51, a4, b7
	daddiu	BO, BO, 32 * SIZE
	MADD3	c61, c61, a4, b2
	daddiu	AO, AO,  8 * SIZE
	MADD1	c71, c71, a4, b3
	NOP
	MADD3	c81, c81, a4, b4
	NOP

	MADD2	c52, c52, a2, b7
	LD	b7, 12 * SIZE(BO)
	MADD4	c62, c62, a2, b2
	LD	b2,  1 * SIZE(BO)
	MADD2	c72, c72, a2, b3
	LD	b3,  2 * SIZE(BO)
	MADD4	c82, c82, a2, b4
	LD	b4,  3 * SIZE(BO)

	/* Look-ahead: a1 half of the next iteration's k-step 0; the fourth
	   product runs in the branch delay slot. */
	MADD1	c11, c11, a1, b1
	LD	a2,  1 * SIZE(AO)
	MADD3	c21, c21, a1, b2
	NOP
	MADD1	c31, c31, a1, b3
	NOP
	bgtz	L, .L12
	MADD3	c41, c41, a1, b4
	.align 3

/* .L13: pipeline drain for the last unrolled iteration. The instruction
   sequence matches the body of .L12, except that the loop counter is not
   decremented and the trailing look-ahead products are omitted. AO and BO
   are still advanced by 8 * SIZE and 32 * SIZE, and a1/a2/a3 and b1..b7
   are left loaded for the remainder loop at .L16. */
.L13:
	/* k-step 0, a2 half, columns 1-2. */
	MADD2	c12, c12, a2, b1
	LD	b1, 16 * SIZE(BO)
	MADD4	c22, c22, a2, b2
	LD	b2,  5 * SIZE(BO)
	MADD2	c32, c32, a2, b3
	LD	b3,  6 * SIZE(BO)
	MADD4	c42, c42, a2, b4
	LD	b4,  7 * SIZE(BO)

	/* k-step 0, columns 3-4. */
	MADD1	c51, c51, a1, b5
	NOP
	MADD3	c61, c61, a1, b2
	LD	a4,  2 * SIZE(AO)
	MADD1	c71, c71, a1, b3
	NOP
	MADD3	c81, c81, a1, b4
	LD	a1,  8 * SIZE(AO)

	MADD2	c52, c52, a2, b5
	LD	b5, 20 * SIZE(BO)
	MADD4	c62, c62, a2, b2
	LD	b2,  9 * SIZE(BO)
	MADD2	c72, c72, a2, b3
	LD	b3, 10 * SIZE(BO)
	MADD4	c82, c82, a2, b4
	LD	b4, 11 * SIZE(BO)

	/* k-step 1, columns 1-2. */
	MADD1	c11, c11, a4, b6
	LD	a2,  3 * SIZE(AO)
	MADD3	c21, c21, a4, b2
	NOP
	MADD1	c31, c31, a4, b3
	NOP
	MADD3	c41, c41, a4, b4
	NOP

	MADD2	c12, c12, a2, b6
	LD	b6, 24 * SIZE(BO)
	MADD4	c22, c22, a2, b2
	LD	b2, 13 * SIZE(BO)
	MADD2	c32, c32, a2, b3
	LD	b3, 14 * SIZE(BO)
	MADD4	c42, c42, a2, b4
	LD	b4, 15 * SIZE(BO)

	/* k-step 1, columns 3-4. */
	MADD1	c51, c51, a4, b7
	NOP
	MADD3	c61, c61, a4, b2
	NOP
	MADD1	c71, c71, a4, b3
	NOP
	MADD3	c81, c81, a4, b4
	NOP

	MADD2	c52, c52, a2, b7
	LD	b7, 28 * SIZE(BO)
	MADD4	c62, c62, a2, b2
	LD	b2, 17 * SIZE(BO)
	MADD2	c72, c72, a2, b3
	LD	b3, 18 * SIZE(BO)
	MADD4	c82, c82, a2, b4
	LD	b4, 19 * SIZE(BO)

	/* k-step 2, columns 1-2. */
	MADD1	c11, c11, a3, b1
	LD	a2,  5 * SIZE(AO)
	MADD3	c21, c21, a3, b2
	NOP
	MADD1	c31, c31, a3, b3
	NOP
	MADD3	c41, c41, a3, b4
	NOP

	MADD2	c12, c12, a2, b1
	LD	b1, 32 * SIZE(BO)
	MADD4	c22, c22, a2, b2
	LD	b2, 21 * SIZE(BO)
	MADD2	c32, c32, a2, b3
	LD	b3, 22 * SIZE(BO)
	MADD4	c42, c42, a2, b4
	LD	b4, 23 * SIZE(BO)

	/* k-step 2, columns 3-4. */
	MADD1	c51, c51, a3, b5
	NOP
	MADD3	c61, c61, a3, b2
	LD	a4,  6 * SIZE(AO)
	MADD1	c71, c71, a3, b3
	NOP
	MADD3	c81, c81, a3, b4
	LD	a3, 12 * SIZE(AO)

	MADD2	c52, c52, a2, b5
	LD	b5, 36 * SIZE(BO)
	MADD4	c62, c62, a2, b2
	LD	b2, 25 * SIZE(BO)
	MADD2	c72, c72, a2, b3
	LD	b3, 26 * SIZE(BO)
	MADD4	c82, c82, a2, b4
	LD	b4, 27 * SIZE(BO)

	/* k-step 3, columns 1-2. */
	MADD1	c11, c11, a4, b6
	LD	a2,  7 * SIZE(AO)
	MADD3	c21, c21, a4, b2
	NOP
	MADD1	c31, c31, a4, b3
	NOP
	MADD3	c41, c41, a4, b4
	NOP

	MADD2	c12, c12, a2, b6
	LD	b6, 40 * SIZE(BO)
	MADD4	c22, c22, a2, b2
	LD	b2, 29 * SIZE(BO)
	MADD2	c32, c32, a2, b3
	LD	b3, 30 * SIZE(BO)
	MADD4	c42, c42, a2, b4
	LD	b4, 31 * SIZE(BO)

	/* k-step 3, columns 3-4. BO and AO are advanced here, so the loads
	   below are relative to the new bases. */
	MADD1	c51, c51, a4, b7
	daddiu	BO, BO, 32 * SIZE
	MADD3	c61, c61, a4, b2
	daddiu	AO, AO,  8 * SIZE
	MADD1	c71, c71, a4, b3
	NOP
	MADD3	c81, c81, a4, b4
	NOP

	MADD2	c52, c52, a2, b7
	LD	b7, 12 * SIZE(BO)
	MADD4	c62, c62, a2, b2
	LD	b2,  1 * SIZE(BO)
	MADD2	c72, c72, a2, b3
	LD	b3,  2 * SIZE(BO)
	MADD4	c82, c82, a2, b4
	LD	b4,  3 * SIZE(BO)
	.align 3

/* .L15: remainder of the four-column inner product. L is the step count
   modulo 4, taken from KK (LT/RN) or from TEMP = K - KK (LN/RT). */
.L15:
#if defined(LT) || defined(RN)
	andi	L, KK,  3
#else
	andi	L, TEMP, 3
#endif
	blez	L, .L18
	NOP
	.align	3

/* .L16: one k-step per iteration over all four columns. AO advances by
   2 * SIZE and BO by 8 * SIZE in the middle of the body, so the loads that
   follow the pointer updates use offsets relative to the new bases. */
.L16:
	/* Columns 1-2. */
	MADD1	c11, c11, a1, b1
	LD	a2,  1 * SIZE(AO)
	MADD3	c21, c21, a1, b2
	NOP
	MADD1	c31, c31, a1, b3
	NOP
	MADD3	c41, c41, a1, b4
	NOP

	MADD2	c12, c12, a2, b1
	LD	b1,  8 * SIZE(BO)
	MADD4	c22, c22, a2, b2
	LD	b2,  5 * SIZE(BO)
	MADD2	c32, c32, a2, b3
	LD	b3,  6 * SIZE(BO)
	MADD4	c42, c42, a2, b4
	LD	b4,  7 * SIZE(BO)

	/* Columns 3-4, interleaved with the counter and pointer updates. */
	MADD1	c51, c51, a1, b5
	daddiu	L, L, -1
	MADD3	c61, c61, a1, b2
	daddiu	AO, AO,  2 * SIZE
	MADD1	c71, c71, a1, b3
	daddiu	BO, BO,  8 * SIZE
	MADD3	c81, c81, a1, b4
	LD	a1,  0 * SIZE(AO)

	MADD2	c52, c52, a2, b5
	LD	b5,  4 * SIZE(BO)
	MADD4	c62, c62, a2, b2
	LD	b2,  1 * SIZE(BO)
	MADD2	c72, c72, a2, b3
	LD	b3,  2 * SIZE(BO)
	MADD4	c82, c82, a2, b4
	/* The final B reload sits in the branch delay slot. */
	bgtz	L, .L16
	LD	b4,  3 * SIZE(BO)

/* .L18: finish one row of the four-column pass. This part folds the
   partial sums, subtracts them from the stored right-hand side and, for
   the left-side variants, applies the solve step. */
.L18:
	/* Fold the cross-term accumulators; the results for columns 1..4 end
	   up in (c11,c12), (c31,c32), (c51,c52) and (c71,c72). */
 	ADD	c11, c11, c22
	ADD	c12, c12, c21
	ADD	c31, c31, c42
	ADD	c32, c32, c41

	ADD	c51, c51, c62
	ADD	c52, c52, c61
	ADD	c71, c71, c82
	ADD	c72, c72, c81

#if defined(LN) || defined(RT)
	/* Reposition AO/BO: one entry back for LN, four entries back for RT. */
#ifdef LN
	daddiu	TEMP, KK, -1
#else
	daddiu	TEMP, KK, -4
#endif

	dsll	L,    TEMP, ZBASE_SHIFT
	dsll	TEMP, TEMP, 2 + ZBASE_SHIFT
	daddu	AO, AORIG, L
	daddu	BO, B,     TEMP
#endif

	/* c = stored value - accumulated sum. The eight stored values come
	   from BO for the left-side variants and from AO otherwise. */
#if defined(LN) || defined(LT)
	LD	b1,  0 * SIZE(BO)
	LD	b2,  1 * SIZE(BO)
	LD	b3,  2 * SIZE(BO)
	LD	b4,  3 * SIZE(BO)
	LD	b5,  4 * SIZE(BO)
	LD	b6,  5 * SIZE(BO)
	LD	b7,  6 * SIZE(BO)
	LD	b8,  7 * SIZE(BO)

	SUB	c11, b1, c11
	SUB	c12, b2, c12
	SUB	c31, b3, c31
	SUB	c32, b4, c32
	SUB	c51, b5, c51
	SUB	c52, b6, c52
 	SUB	c71, b7, c71
	SUB	c72, b8, c72

#else
	LD	b1,  0 * SIZE(AO)
	LD	b2,  1 * SIZE(AO)
	LD	b3,  2 * SIZE(AO)
	LD	b4,  3 * SIZE(AO)
	LD	b5,  4 * SIZE(AO)
	LD	b6,  5 * SIZE(AO)
	LD	b7,  6 * SIZE(AO)
	LD	b8,  7 * SIZE(AO)

	SUB	c11, b1, c11
	SUB	c12, b2, c12
 	SUB	c31, b3, c31
	SUB	c32, b4, c32
	SUB	c51, b5, c51
	SUB	c52, b6, c52
	SUB	c71, b7, c71
	SUB	c72, b8, c72
#endif

#if defined(LN) || defined(LT)
	/* Left side: all four columns are combined with the same pair
	   (b1, b2) read from AO, two columns at a time.
	   NOTE(review): a multiplication is used, so the entry is presumably
	   stored pre-inverted by the packing code - confirm. */
	LD	b1,  0 * SIZE(AO)
	LD	b2,  1 * SIZE(AO)

	MUL	a1, b2, c12
	MUL	a2, b2, c11
	MUL	a3, b2, c32
	MUL	a4, b2, c31

	MADD5	c11, a1, b1, c11
	MADD6	c12, a2, b1, c12
	MADD5	c31, a3, b1, c31
	MADD6	c32, a4, b1, c32

	MUL	a1, b2, c52
	MUL	a2, b2, c51
	MUL	a3, b2, c72
	MUL	a4, b2, c71

	MADD5	c51, a1, b1, c51
	MADD6	c52, a2, b1, c52
	MADD5	c71, a3, b1, c71
	MADD6	c72, a4, b1, c72
#endif

/* RN solve step for four columns: forward pass over the 4x4 block at BO,
   whose rows are 8 * SIZE apart. For each column in turn the value pair is
   scaled by the diagonal entry (offsets 0, 10, 20, 30) and then eliminated
   from the columns to its right using the rest of that row. */
#ifdef RN
	LD	b1,  0 * SIZE(BO)
	LD	b2,  1 * SIZE(BO)
	LD	b3,  2 * SIZE(BO)
	LD	b4,  3 * SIZE(BO)
	LD	b5,  4 * SIZE(BO)
	LD	b6,  5 * SIZE(BO)
	LD	b7,  6 * SIZE(BO)
	LD	b8,  7 * SIZE(BO)

	/* Column 1: scale by the entry at offsets 0,1. */
	MUL	a1, b2, c12
	MUL	a2, b2, c11

	MADD5	c11, a1, b1, c11
	MADD6	c12, a2, b1, c12

	/* Eliminate column 1 from columns 2, 3 and 4. */
	NMSUB	c31, c31, b3, c11
	MADD7	c32, c32, b4, c11
	NMSUB	c51, c51, b5, c11
	MADD7	c52, c52, b6, c11
	NMSUB	c71, c71, b7, c11
	MADD7	c72, c72, b8, c11

	MADD8	c31, c31, b4, c12
	NMSUB	c32, c32, b3, c12
	MADD8	c51, c51, b6, c12
	NMSUB	c52, c52, b5, c12
	MADD8	c71, c71, b8, c12
	NMSUB	c72, c72, b7, c12

	/* Row 2 of the block: diagonal at 10,11, then columns 3 and 4. */
	LD	b3, 10 * SIZE(BO)
	LD	b4, 11 * SIZE(BO)
	LD	b5, 12 * SIZE(BO)
	LD	b6, 13 * SIZE(BO)
	LD	b7, 14 * SIZE(BO)
	LD	b8, 15 * SIZE(BO)

	MUL	a1, b4, c32
	MUL	a2, b4, c31

	MADD5	c31, a1, b3, c31
	MADD6	c32, a2, b3, c32

	NMSUB	c51, c51, b5, c31
	MADD7	c52, c52, b6, c31
	NMSUB	c71, c71, b7, c31
	MADD7	c72, c72, b8, c31

	MADD8	c51, c51, b6, c32
	NMSUB	c52, c52, b5, c32
	MADD8	c71, c71, b8, c32
	NMSUB	c72, c72, b7, c32

	/* Row 3 of the block: diagonal at 20,21, then column 4. */
	LD	b5, 20 * SIZE(BO)
	LD	b6, 21 * SIZE(BO)
	LD	b7, 22 * SIZE(BO)
	LD	b8, 23 * SIZE(BO)

	MUL	a1, b6, c52
	MUL	a2, b6, c51

	MADD5	c51, a1, b5, c51
	MADD6	c52, a2, b5, c52

	NMSUB	c71, c71, b7, c51
	MADD7	c72, c72, b8, c51

	MADD8	c71, c71, b8, c52
	NMSUB	c72, c72, b7, c52

	/* Row 4 of the block: diagonal at 30,31. */
	LD	b7, 30 * SIZE(BO)
	LD	b8, 31 * SIZE(BO)

	MUL	a1, b8, c72
	MUL	a2, b8, c71

	MADD5	c71, a1, b7, c71
	MADD6	c72, a2, b7, c72
#endif

/* RT solve step for four columns: backward pass over the 4x4 block at BO,
   whose rows are 8 * SIZE apart. Starting from the last column, each value
   pair is scaled by the diagonal entry (offsets 30, 20, 10, 0) and then
   eliminated from the columns to its left using the rest of that row. */
#ifdef RT
	LD	b1, 30 * SIZE(BO)
	LD	b2, 31 * SIZE(BO)
	LD	b3, 28 * SIZE(BO)
	LD	b4, 29 * SIZE(BO)
	LD	b5, 26 * SIZE(BO)
	LD	b6, 27 * SIZE(BO)
	LD	b7, 24 * SIZE(BO)
	LD	b8, 25 * SIZE(BO)

	/* Column 4: scale by the entry at offsets 30,31. */
	MUL	a1, b2, c72
	MUL	a2, b2, c71

	MADD5	c71, a1, b1, c71
	MADD6	c72, a2, b1, c72

	/* Eliminate column 4 from columns 3, 2 and 1. */
	NMSUB	c51, c51, b3, c71
	MADD7	c52, c52, b4, c71
	NMSUB	c31, c31, b5, c71
	MADD7	c32, c32, b6, c71
	NMSUB	c11, c11, b7, c71
	MADD7	c12, c12, b8, c71

	MADD8	c51, c51, b4, c72
	NMSUB	c52, c52, b3, c72
	MADD8	c31, c31, b6, c72
	NMSUB	c32, c32, b5, c72
	MADD8	c11, c11, b8, c72
	NMSUB	c12, c12, b7, c72

	/* Row 3 of the block: diagonal at 20,21, then columns 2 and 1. */
	LD	b3, 20 * SIZE(BO)
	LD	b4, 21 * SIZE(BO)
	LD	b5, 18 * SIZE(BO)
	LD	b6, 19 * SIZE(BO)
	LD	b7, 16 * SIZE(BO)
	LD	b8, 17 * SIZE(BO)

	MUL	a1, b4, c52
	MUL	a2, b4, c51

	MADD5	c51, a1, b3, c51
	MADD6	c52, a2, b3, c52

	NMSUB	c31, c31, b5, c51
	MADD7	c32, c32, b6, c51
	NMSUB	c11, c11, b7, c51
	MADD7	c12, c12, b8, c51

	MADD8	c31, c31, b6, c52
	NMSUB	c32, c32, b5, c52
	MADD8	c11, c11, b8, c52
	NMSUB	c12, c12, b7, c52

	/* Row 2 of the block: diagonal at 10,11, then column 1. */
	LD	b5, 10 * SIZE(BO)
	LD	b6, 11 * SIZE(BO)
	LD	b7,  8 * SIZE(BO)
	LD	b8,  9 * SIZE(BO)

	MUL	a1, b6, c32
	MUL	a2, b6, c31

	MADD5	c31, a1, b5, c31
	MADD6	c32, a2, b5, c32

	NMSUB	c11, c11, b7, c31
	MADD7	c12, c12, b8, c31

	MADD8	c11, c11, b8, c32
	NMSUB	c12, c12, b7, c32

	/* Row 1 of the block: diagonal at 0,1. */
	LD	b7,  0 * SIZE(BO)
	LD	b8,  1 * SIZE(BO)

	MUL	a1, b8, c12
	MUL	a2, b8, c11

	MADD5	c11, a1, b7, c11
	MADD6	c12, a2, b7, c12
#endif

/* Store the four solved value pairs back to the packed buffer (BO for the
   left-side variants, AO otherwise) and to the four output columns, then
   step the pointers and counters to the next row. */
#if defined(LN) || defined(LT)
	ST	c11,  0 * SIZE(BO)
	ST	c12,  1 * SIZE(BO)
	ST	c31,  2 * SIZE(BO)
	ST	c32,  3 * SIZE(BO)
	ST	c51,  4 * SIZE(BO)
	ST	c52,  5 * SIZE(BO)
	ST	c71,  6 * SIZE(BO)
	ST	c72,  7 * SIZE(BO)
#else
	ST	c11,  0 * SIZE(AO)
	ST	c12,  1 * SIZE(AO)
	ST	c31,  2 * SIZE(AO)
	ST	c32,  3 * SIZE(AO)
	ST	c51,  4 * SIZE(AO)
	ST	c52,  5 * SIZE(AO)
	ST	c71,  6 * SIZE(AO)
	ST	c72,  7 * SIZE(AO)
#endif

#ifdef LN
	/* LN walks C backwards: pre-decrement the output pointers. */
	daddiu	CO1,CO1, -2 * SIZE
	daddiu	CO2,CO2, -2 * SIZE
	daddiu	CO3,CO3, -2 * SIZE
	daddiu	CO4,CO4, -2 * SIZE
#endif

	ST	c11,  0 * SIZE(CO1)
	ST	c12,  1 * SIZE(CO1)
	ST	c31,  0 * SIZE(CO2)
	ST	c32,  1 * SIZE(CO2)
	ST	c51,  0 * SIZE(CO3)
	ST	c52,  1 * SIZE(CO3)
	ST	c71,  0 * SIZE(CO4)
	ST	c72,  1 * SIZE(CO4)

#ifndef LN
	daddiu	CO1,CO1, 2 * SIZE
	daddiu	CO2,CO2, 2 * SIZE
	daddiu	CO3,CO3, 2 * SIZE
	daddiu	CO4,CO4, 2 * SIZE
#endif


#ifdef RT
	dsll	TEMP, K, ZBASE_SHIFT
	daddu	AORIG, AORIG, TEMP
#endif

#if defined(LT) || defined(RN)
	/* Skip the K - KK entries not consumed by this row (B holds four
	   value pairs per step, hence the extra shift by two). */
	dsubu	TEMP, K, KK
	dsll	L,    TEMP, ZBASE_SHIFT
	dsll	TEMP, TEMP, 2 + ZBASE_SHIFT
	daddu	AO, AO, L
	daddu	BO, BO, TEMP
#endif

#ifdef LT
	daddiu	KK, KK, 1
#endif

#ifdef LN
	daddiu	KK, KK, -1
#endif

	/* Re-establish the state .L11 expects on entry: c11..c61 zeroed (the
	   last clear runs in the branch delay slot). */
	MTC	$0,  c11

	daddiu	I, I, -1


	MOV	c21, c11
	MOV	c31, c11
	MOV	c41, c11
	MOV	c51, c11

	bgtz	I, .L11
	MOV	c61, c11
	.align 3

/* .L19: end of one four-column group. Move B past (or onto) the panel just
   processed, update KK by four for the right-side variants and loop while
   groups remain (J was decremented at .L10). */
.L19:
#ifdef LN
	/* LN never moved B while iterating, so advance it by four columns. */
	dsll	TEMP, K, 2 + ZBASE_SHIFT
	daddu	B, B, TEMP
#endif

#if defined(LT) || defined(RN)
	/* BO already points past the columns that were just consumed. */
	move	B,  BO
#endif

#ifdef RN
	daddiu	KK, KK,  4
#endif

#ifdef RT
	daddiu	KK, KK, -4
#endif

	bgtz	J, .L10
	NOP
	.align 3

/* .L999: common exit. Restore every register saved in the prologue from
   the same frame offsets, release the 128-byte frame and return. */
.L999:
	LDARG	$16,   0($sp)
	LDARG	$17,   8($sp)
	LDARG	$18,  16($sp)
	LDARG	$19,  24($sp)
	LDARG	$20,  32($sp)
	LDARG	$21,  40($sp)

	ldc1	$f24, 48($sp)
	ldc1	$f25, 56($sp)
	ldc1	$f26, 64($sp)
	ldc1	$f27, 72($sp)

#ifndef __64BIT__
	ldc1	$f20, 88($sp)
	ldc1	$f21, 96($sp)
	ldc1	$f22,104($sp)
	ldc1	$f23,112($sp)
#endif

	/* The stack pointer is restored in the delay slot of the return. */
	j	$31
	daddiu	$sp, $sp, 128

	EPILOGUE
